library(glmnet)
library(dplyr)
library(tidyverse)
library(edgeR)
library(limma)
library(AnnotationHub)
library(ensembldb)
library(biomaRt)
library(dbplyr) #must load this package to make sure AnnotationHub can grep the database
library(DT)
library(survival)
library(ggpubr)
library(survminer)
library(rstatix)
library(ezcox)TCGA_meta_data<-read_tsv(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/TCGA/gdc_sample_sheet.2024-02-06.tsv", col_names = T)
TCGA_rna_read_counts_files<-list.files(path = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/TCGA/read_count",
pattern = "*counts.tsv",
recursive = T,
full.names = T)
meta_data<-as.data.frame(TCGA_meta_data)[, c("File Name", "Sample ID", "Case ID", "Sample Type")]
tumor_meta_data<-meta_data[meta_data$`Sample Type` !="Solid Tissue Normal", ]
read_count<-as.data.frame(read.table(TCGA_rna_read_counts_files[1], sep = "\t", header = T))
read_count<-read_count[5:nrow(read_count), c("gene_id", "gene_name", "unstranded")]
colnames(read_count)<-c("gene_id", "gene_name", basename(TCGA_rna_read_counts_files[1]))
for (i in 2:length(TCGA_rna_read_counts_files)){
file_name<-basename(TCGA_rna_read_counts_files[i])
df<-read.table(TCGA_rna_read_counts_files[i], sep = "\t", header = T)
df<-df[5:nrow(df), c("gene_id", "gene_name", "unstranded")]
colnames(df)<-c("gene_id", "gene_name", file_name)
read_count<-left_join(read_count, df, by = c("gene_id", "gene_name"))
}
read_count<-readRDS("/xdisk/mpadi/jiawenyang/data/centrosome_loss/TCGA/read_count_unstranded_TCGA.rds")
samples<-colnames(read_count)[3:length(colnames(read_count))]
tumor_samples<-samples[samples %in% tumor_meta_data$`File Name`]
tumor_matrix<-read_count[, c("gene_id","gene_name",tumor_meta_data$`File Name`)]
colnames(tumor_matrix)<-c("gene_id", "gene_name", tumor_meta_data$`Sample ID`)
dim(tumor_matrix)
# Normalize gene expression matrix.
group<-factor(colnames(tumor_matrix)[3:length(colnames(tumor_matrix))])
counts<-tumor_matrix[,3:ncol(tumor_matrix)]
rownames(counts)<-tumor_matrix$gene_id
y <-DGEList(counts = counts, group = group)
keep <- rowSums(cpm(y) > 1) >= length(group)/2
y <- y[keep, keep.lib.sizes = FALSE]
y <- calcNormFactors(y, method = "TMM")
design <- model.matrix(~0 + group)
colnames(design)<-gsub("group", "", colnames(design))
temp <- voom(y, design, plot = T)
edat <- temp$E
edat <- as.data.frame(edat)sig3_patient<-substr(sig3_patient, 1, nchar(sig3_patient)-3)
edat<-readRDS(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/TCGA/tcga_prad_tumor_edat.rds")
colnames(edat)<-unlist(lapply(colnames(edat), function(x) substr(x, 1, 12)))library(glmnet)
labels<-ifelse(colnames(edat) %in% sig3_patient, 1, 0)
#prepare data
x = as.matrix(t(edat))
y = labels
#perform lasso regression with cross-validation
set.seed(1234)
lasso_model<-cv.glmnet(x, y, alpha = 1, nfolds = 10) #ALPHA = 1 for lasso
#Extract selected features (genes) based on the optimal lambda
selected_genes<-coef(lasso_model, s = "lambda.min")
non_zero_genes <- rownames(selected_genes)[which(selected_genes != 0)]
ensembl<-useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
ensemblId<-non_zero_genes[-1] #exclude intercept
ensemblId<-unlist(lapply(ensemblId, function(x) strsplit(x, "[.]")[[1]][1]))
library(dbplyr)
signature_gene_annotation<-biomaRt::getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol","chromosome_name","band","description","start_position","end_position", "entrezgene_id"),
values=ensemblId, mart=ensembl)
signature_gene_annotation<-signature_gene_annotation[signature_gene_annotation$hgnc_symbol != "",]
DT::datatable(signature_gene_annotation)CIN_genes<-readxl::read_excel("/xdisk/mpadi/jiawenyang/result/centrosome_loss/lasso_signature_genes/CIN_signature_genes.xlsx")
#CIN70
CIN70<-CIN_genes$CIN70 #Orginal list from published paper: https://www.nature.com/articles/ng1861. Need to adjust gene names to their formal human gene symbol.
CIN70[61]<-"GPI"
CIN70[4]<-"CDCA2" #Cell division cycle-associated protein, was CDC2 in the list.
CIN70[13]<-"CEP250" #Centrosome Nek2-associated protein 1, was CNAP1
CIN70[18]<-"CDC45" #Cell division cycle 45 -like was CDC45L
CIN70[46]<-"NCAPH" #Non-SMC condensin I complex subunit H. was BRRN1
CIN70[49]<-"NCAPG2" #Non-SMC condensin II complex subunit G2. was MTB
CIN70[52]<-"PBK" #PDZ binding kinase gene was TOPK
CIN70[62]<-"SRSF2" #Serine And arginine rich splicing factor 2.was SFRS2
CIN70[67]<-"AURKA" #Aurora kinase A.was STK6
CIN70[69]<-"TMEM194A" #Nuclear envelope integral membrane protein 1.was KIAA0286
#CIN25
CIN25<-na.omit(CIN_genes$CIN25)#Orginal list from published paper: https://www.nature.com/articles/ng1861.
CIN25[3]<-"CDC45" #Cell division cycle 45 -like was CDC45L
CIN25[6]<-"CDK1" #Cyclin dependent kinease 1, was CDC2.
CIN25[14]<-"CEP250" #Centrosome Nek2-associated protein 1, was CNAP1
#CA20
CA20<-na.omit(CIN_genes$CA20) #gene names are all formal gene symbol.
#CIN23
CIN23<-na.omit(CIN_genes$CIN23) #gene names are all formal gene symbol.#GTF2IP7 and CD24 are not included in the dataset
#CIN7
CIN7<-na.omit(CIN_genes$CIN7) #gene names are all formal gene symbol.
#CN-sig3
CN_sig3<-na.omit(CIN_genes$`CN-sig3 genes_with_All_TCGA_prad_genes`) #gene names are all formal gene symbol.dkfz_df<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/DKFZ_2018/data_mrna_seq_rpkm_zscores_ref_all_samples.txt",sep = "\t", header = T)
zdat<-dkfz_df[,c(1,3:ncol(dkfz_df))]
sample_metadata<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/DKFZ_2018/data_clinical_sample.txt", sep = "\t", header = T)
patient_metadata<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/DKFZ_2018/data_clinical_patient.txt", sep = "\t", header = T)
CIN.gene.set<-c("CIN70", "CIN25", "CIN23", "CIN7", "CA20", "CN_sig3")
DKFZ_CIN_zscore_matrix<-list()
DKFZ_CIN_clincal_matrix<-list()
for (i in 1:length(CIN.gene.set)){
CIN.name<-CIN.gene.set[i]
cn_signature_genes <- get(CIN.gene.set[i])
subzdat<-zdat[zdat$Hugo_Symbol %in% cn_signature_genes,]
clinical_metadata<-left_join(sample_metadata, patient_metadata, by = "PATIENT_ID")
duplicated_samples<-clinical_metadata[duplicated(clinical_metadata$PATIENT_ID), "SAMPLE_ID"]
subzdat<-subzdat[, !colnames(subzdat) %in% intersect(duplicated_samples, colnames(subzdat))]
altered_group<-c()
for (k in 1:nrow(subzdat)){
sample_id<-colnames(subzdat[,2:ncol(subzdat)])[which(abs(subzdat[k, 2:ncol(subzdat)]) >= 2)] #absolute z-score greater than 2 is defined as altered.
altered_group<-c(altered_group, sample_id)
}
altered_group<-unique(altered_group)
unaltered_group<-unique(colnames(subzdat)[-1][!colnames(subzdat)[-1] %in% altered_group])
subzdat_ordered<-subzdat[, c(altered_group, unaltered_group)]
rownames(subzdat_ordered)<-subzdat$Hugo_Symbol
DKFZ_CIN_zscore_matrix[[CIN.name]]<-subzdat_ordered
mrnaseq_clinical_metadata<-clinical_metadata[clinical_metadata$SAMPLE_ID %in% colnames(subzdat)[2:ncol(subzdat)],]
alt_sample_metadata<-mrnaseq_clinical_metadata[mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group), ]
unalt_sample_metadata<-mrnaseq_clinical_metadata[!mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group),]
alt_sample_metadata[, "group"]<-rep("altered", nrow(alt_sample_metadata))
unalt_sample_metadata[,"group"]<-rep("unaltered", nrow(unalt_sample_metadata))
clinical_data<-rbind(alt_sample_metadata, unalt_sample_metadata)
DKFZ_CIN_clincal_matrix[[CIN.name]]<-clinical_data
clinical_data_KM<-clinical_data
BCR_curve<-Surv(clinical_data_KM$TIME_FROM_SURGERY_TO_BCR_LASTFU, clinical_data_KM$BCR_STATUS)
BCR_Sfit<-survfit(Surv(TIME_FROM_SURGERY_TO_BCR_LASTFU, BCR_STATUS)~group, data = clinical_data_KM)
dkfz_prad_relapse<-ggsurvplot(BCR_Sfit,
conf.int = F,
pval = T,
risk.table = T,
legend.labs = c("altered", "unaltered"),
legend.title = "CIN signature genes",
xlab = "Time (month)",
title = paste0("DKFZ Prostate cancer patients BCR-free survival \n ", CIN.name, " alteration"),
palette = c("orange", "#5BC0EB"),
font.tickslab = c(15),
risk.table.height = .25)
print(dkfz_prad_relapse)
}#GLEASON score
library(RColorBrewer)
col_values = rev(brewer.pal(n = 7, name = "RdBu"))
col_values[4] <- "#e7e689"
clinical_data_gleason<-clinical_data %>% group_by(group) %>% dplyr::count(GLEASON_SCORE)
GLEASON_df<-data.frame("altered" = c(1, 8, 5, 0, 4, 5, 0), "unaltered" = c(12, 48, 6, 1, 2, 1, 1),
row.names = c("3+3", "3+4", "4+3", "4+4", "4+5", "5+4", "5+5"))
p_gleason <- ggplot(data=clinical_data_gleason,
aes(x=factor(group, levels = c("altered", "unaltered")),
y=n,
fill=GLEASON_SCORE)) +
scale_fill_manual(labels = c("3+3", "3+4", "4+3", "4+4", "4+5", "5+4", "5+5"), values=col_values) +
geom_bar(position = "fill", stat="identity", width = 0.6) +
geom_text(aes(label = n), size = 3, hjust = 0.5, vjust = 3, position = "fill") +
xlab("sample groups") +
ylab("Percentage of each sample type") +
ggtitle("Associate CIN signatures genes expression with patients PCa grades") +
theme_bw () +
theme(title = element_text(face = "bold"),
legend.title = element_text(face = "bold"),
legend.text = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
axis.text.y = element_text(face = "bold"),
axis.title.x = element_text(face = "bold"),
strip.text.x = element_blank(),
strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
legend.position = "right",
panel.spacing = unit(0,'lines'))
p_gleason#GLEASON score
clinical_data_gleason<-clinical_data %>% group_by(group) %>% dplyr::count(GLEASON_SCORE)
GLEASON_df<-data.frame("altered" = c(1, 8, 5, 0, 4, 5, 0), "unaltered" = c(12, 48, 6, 1, 2, 1, 1),
row.names = c("3+3", "3+4", "4+3", "4+4", "4+5", "5+4", "5+5"))
print(fisher.test(GLEASON_df))##
## Fisher's Exact Test for Count Data
##
## data: GLEASON_df
## p-value = 0.000167
## alternative hypothesis: two.sided
p_purity<-ggplot(data = clinical_data,
aes(x = factor( group, levels = c("altered", "unaltered")),
y = MEDIAN_PURITY,
fill = group,
color = group)) +
theme_light() +
geom_boxplot(alpha = 0.4) +
geom_jitter(shape = 16, position = position_jitter(0.2)) +
scale_fill_manual(values = c("altered" = "orange", "unaltered" = "#5BC0EB")) +
scale_color_manual(values = c("altered" = "orange", "unaltered" = "#5BC0EB")) +
stat_compare_means(label.y =70, label.x = 2, method = "t.test")+
labs(title="Associate CIN signatures genes expression with PCa sample purity", x="patient groups", y = "sample purity")+
theme(plot.title = element_text(size=20, face = "bold"),
axis.text.x = element_text(size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 15, face = "bold"), axis.title.y = element_text(size = 15, face = "bold"),
legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15, face = "bold"))
p_purityprad_mskcc<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_mskcc/data_mrna_agilent_microarray_zscores_ref_diploid_samples.txt", sep = "\t", header = T)
Entrez_id<-prad_mskcc$Entrez_Gene_Id
genes_annotation<-biomaRt::getBM(filters= "entrezgene_id", attributes= c("ensembl_gene_id","hgnc_symbol","chromosome_name","band","description","start_position","end_position", "entrezgene_id"),
values=Entrez_id, mart=ensembl)
zdat<-left_join(prad_mskcc, genes_annotation, by = c("Entrez_Gene_Id" = "entrezgene_id"))zdat<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/lasso_signature_genes/prad_mskcc_zdat.rds")
sample_metadata<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_mskcc/data_clinical_sample.txt", sep = "\t", header = T)
patient_metadata<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_mskcc/data_clinical_patient.txt", sep = "\t", header = T)
clinical_metadata<-left_join(sample_metadata, patient_metadata, by = "PATIENT_ID")
MSKCC_CIN_zscore_matrix<-list()
MSKCC_CIN_clincal_matrix<-list()
for(i in 1:length(CIN.gene.set)){
CIN.name<-CIN.gene.set[i]
cn_signature_genes <- get(CIN.gene.set[i])
subzdat<-zdat[zdat$hgnc_symbol %in% cn_signature_genes, c(153, 2:151)] #variables
subzdat<-subzdat[!duplicated(subzdat$hgnc_symbol),]
altered_group<-c()
for (k in 1:nrow(subzdat)){
sample_id<-colnames(subzdat[,2:ncol(subzdat)])[which(abs(subzdat[k, 2:ncol(subzdat)]) >= 3)] #absolute z-score greater than 3 is defined as altered.
altered_group<-c(altered_group, sample_id)
}
altered_group<-unique(altered_group)
unaltered_group<-unique(colnames(subzdat)[-1][!colnames(subzdat)[-1] %in% altered_group])
subzdat_ordered<-subzdat[, c(altered_group, unaltered_group)]
rownames(subzdat_ordered)<-subzdat$hgnc_symbol
MSKCC_CIN_zscore_matrix[[CIN.name]]<-subzdat_ordered
mrnaseq_clinical_metadata<-clinical_metadata[clinical_metadata$SAMPLE_ID %in% colnames(zdat)[2:nrow(zdat)],]
alt_sample_metadata<-mrnaseq_clinical_metadata[mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group), ]
unalt_sample_metadata<-mrnaseq_clinical_metadata[!mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group),]
alt_sample_metadata[, "group"]<-rep("altered", nrow(alt_sample_metadata))
unalt_sample_metadata[,"group"]<-rep("unaltered", nrow(unalt_sample_metadata))
clinical_data<-rbind(alt_sample_metadata, unalt_sample_metadata)
clinical_data$DFS_STATUS<-as.numeric(substr(clinical_data$DFS_STATUS, 1, 1))
MSKCC_CIN_clincal_matrix[[CIN.name]]<-clinical_data
#kaplan meier analysis
clinical_data_KM<-clinical_data[!is.na(clinical_data$DFS_STATUS),]
DF_curve<-Surv(clinical_data_KM$DFS_MONTHS, clinical_data_KM$DFS_STATUS)
DF_Sfit<-survfit(Surv(DFS_MONTHS, DFS_STATUS)~group, data = clinical_data_KM)
mskcc_prad_relapse<-ggsurvplot(DF_Sfit,
conf.int = F,
pval = T,
risk.table = T,
legend.labs = c("altered", "unaltered"),
legend.title = "CIN signature genes",
xlab = "Time (month)",
title = paste0("MSKCC Prostate cancer patients Disease-free survival \n ", CIN.name, " alteration"),
palette = c("orange", "#5BC0EB"),
font.tickslab = c(15),
risk.table.height = .25)
print(mskcc_prad_relapse)
}library(RColorBrewer)
# Associate with Gleason Score
clinica_data_gleason<-clinical_data
clinica_data_gleason[, "GLEASON_SCORE_BOTH_SITES"]<-paste0(clinical_data$GLEASON_SCORE_1, "+", clinical_data$GLEASON_SCORE_2)
clinical_data_gleason<-clinica_data_gleason %>% group_by(group) %>% dplyr::count(GLEASON_SCORE_BOTH_SITES)
gsub("NA+NA", "NA", clinical_data_gleason$GLEASON_SCORE_BOTH_SITES)## [1] "3+3" "3+4" "4+3" "4+4" "4+5" "5+3" "NA+NA" "3+3" "3+4"
## [10] "3+5" "4+3" "4+4" "4+5" "5+3" "NA+NA"
GLEASON_df<-data.frame("altered" = c(9, 15, 0, 7, 5, 8, 1, 9), "unaltered" = c(32, 38, 1, 16, 3, 3, 1, 2),
row.names = c("3+3", "3+4", "3+5", "4+3", "4+4", "4+5", "5+3", "NA+NA"))
fisher.test(GLEASON_df)##
## Fisher's Exact Test for Count Data
##
## data: GLEASON_df
## p-value = 0.0003428
## alternative hypothesis: two.sided
col_values = rev(brewer.pal(n = 8, name = "RdBu"))
col_values[8] <- alpha("grey", 0.6)
p_gleason <- ggplot(data=clinical_data_gleason,
aes(x=factor(group, levels = c("altered", "unaltered")),
y=n,
fill=GLEASON_SCORE_BOTH_SITES)) +
scale_fill_manual(labels = c("3+3", "3+4", "3+5", "4+3", "4+4", "4+5", "5+3", "NA+NA"), values=col_values) +
geom_bar(position = "fill", stat="identity", width = 0.6) +
geom_text(aes(label = n), size = 3, hjust = 0.5, vjust = 3, position = "fill") +
xlab("sample groups") +
ylab("Percentage of each sample type") +
ggtitle("Associate CIN signatures genes expression with patients PCa grades") +
theme_bw () +
theme(title = element_text(face = "bold"),
legend.title = element_text(face = "bold"),
legend.text = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
axis.text.y = element_text(face = "bold"),
axis.title.x = element_text(face = "bold"),
strip.text.x = element_blank(),
strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
legend.position = "right",
panel.spacing = unit(0,'lines'))
p_gleasonlibrary(RColorBrewer)
# Associate with sample types- metastasis or primary
clinical_data_type<-clinical_data %>% group_by(group) %>% dplyr::count(SAMPLE_TYPE)
type_df<-data.frame("altered" = c(17, 37), "unaltered" = c(2, 94), row.names = c("Metastasis", "Primary"))
print(fisher.test(type_df))##
## Fisher's Exact Test for Count Data
##
## data: type_df
## p-value = 4.049e-07
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 4.661771 197.138709
## sample estimates:
## odds ratio
## 21.1432
col_values = rev(brewer.pal(n = 8, name = "RdBu"))
col_values_type<-col_values[c(6, 3)]
p_sample_type <- ggplot(data=clinical_data_type,
aes(x=factor(group, levels = c("altered", "unaltered")),
y=n,
fill=SAMPLE_TYPE)) +
scale_fill_manual(labels = c("Metastasis", "Primary"), values=col_values_type) +
geom_bar(position = "fill", stat="identity", width = 0.6) +
geom_text(aes(label = n), size = 3, hjust = 0.5, vjust = 3, position = "fill") +
xlab("sample groups") +
ylab("Percentage of each sample type") +
ggtitle("Associate CIN signatures genes expression with patients tumor type") +
theme_bw () +
theme(title = element_text(face = "bold"),
legend.title = element_text(face = "bold"),
legend.text = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
axis.text.y = element_text(face = "bold"),
axis.title.x = element_text(face = "bold"),
strip.text.x = element_blank(),
strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
#strip.background =element_rect(fill="white"),
legend.position = "right",
panel.spacing = unit(0,'lines'))
p_sample_type
## Performing analyses in SU2C/DREAM TEAM 2018 PNAS (mCRPC) dataset
su2c_patient_data<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_su2c_2019/data_clinical_patient.txt", sep = "\t", header = T)
su2c_sample_data<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_su2c_2019/data_clinical_sample.txt", sep = "\t", header = T)
su2c_clinical_data<-dplyr::left_join(su2c_patient_data, su2c_sample_data, by = c("PATIENT_ID"))
duplicated_samples<-su2c_clinical_data[duplicated(su2c_clinical_data$PATIENT_ID), "SAMPLE_ID"]
su2c_zdat<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_su2c_2019/data_mrna_seq_fpkm_capture_zscores_ref_all_samples.txt", sep = "\t", header = T)
colnames(su2c_zdat)<-gsub("[.]", "-", colnames(su2c_zdat))
su2c_zdat<-su2c_zdat[, !colnames(su2c_zdat) %in% duplicated_samples]
SU2C_CIN_zscore_matrix<-list()
SU2C_CIN_clincal_matrix<-list()
for (i in 1:length(CIN.gene.set)){
CIN.name<-CIN.gene.set[i]
cn_signature_genes <- get(CIN.gene.set[i])
su2c_zdat_subset<-su2c_zdat[su2c_zdat$Hugo_Symbol %in% cn_signature_genes, c(1, which(colnames(su2c_zdat) %in% su2c_clinical_data[,"SAMPLE_ID"]))]
su2c_zdat_subset<-su2c_zdat_subset[!duplicated(su2c_zdat_subset$Hugo_Symbol),]
altered_group<-c()
for (k in 1:nrow(su2c_zdat_subset)){
sample_id<-colnames(su2c_zdat_subset[,2:ncol(su2c_zdat_subset)])[which(abs(su2c_zdat_subset[k, 2:ncol(su2c_zdat_subset)]) >= 2)]
altered_group<-c(altered_group, sample_id)
}
altered_group<-unique(altered_group)
unaltered_group<-unique(colnames(su2c_zdat_subset)[-1][!colnames(su2c_zdat_subset)[-1] %in% altered_group])
subzdat_ordered<-su2c_zdat_subset[, c(altered_group, unaltered_group)]
rownames(subzdat_ordered)<-su2c_zdat_subset$Hugo_Symbol
SU2C_CIN_zscore_matrix[[CIN.name]]<-subzdat_ordered
mrnaseq_clinical_metadata<-su2c_clinical_data[su2c_clinical_data$SAMPLE_ID %in% colnames(su2c_zdat_subset)[2:ncol(su2c_zdat_subset)],]
alt_sample_metadata<-mrnaseq_clinical_metadata[mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group), ]
unalt_sample_metadata<-mrnaseq_clinical_metadata[!mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group),]
alt_sample_metadata[, "group"]<-rep("altered", nrow(alt_sample_metadata))
unalt_sample_metadata[,"group"]<-rep("unaltered", nrow(unalt_sample_metadata))
clinical_data<-rbind(alt_sample_metadata, unalt_sample_metadata)
clinical_data<-clinical_data[!duplicated(clinical_data$PATIENT_ID),]
clinical_data$OS_STATUS<-as.numeric(substr(clinical_data$OS_STATUS, 1, 1))
SU2C_CIN_clincal_matrix[[CIN.name]]<-clinical_data
clinical_data_KM<-clinical_data
clinical_data_KM$OS_MONTHS <- as.numeric(clinical_data_KM$OS_MONTHS)
BCR_curve<-Surv(clinical_data_KM$OS_MONTHS, clinical_data_KM$OS_STATUS)
BCR_Sfit<-survfit(Surv(OS_MONTHS, OS_STATUS)~group, data = clinical_data_KM)
su2c_prad_relapse<-ggsurvplot(BCR_Sfit,
conf.int = F,
pval = T,
risk.table = T,
legend.labs = c("altered", "unaltered"),
legend.title = "CIN signature genes",
xlab = "Time (month)",
title = paste0("SU2C Prostate cancer patients Overall survival \n ", CIN.name, " alteration"),
palette = c("orange", "#5BC0EB"),
font.tickslab = c(15),
risk.table.height = .25)
print(su2c_prad_relapse)
}su2c_2019_clinical<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_su2c_2019/prad_su2c_2019_clinical_data_cbioportal.tsv", sep = "\t", header = T)
su2c_2019_clinical<-su2c_2019_clinical[!duplicated(su2c_2019_clinical$Patient.ID),]
su2c_clinical_tumor_burden<-left_join(clinical_data, su2c_2019_clinical, by = c("PATIENT_ID" = "Patient.ID"))
genome_altered.stat.test <- su2c_clinical_tumor_burden %>%
t_test(Fraction.Genome.Altered ~ group) %>%
add_significance()
genome_altered.stat.test## # A tibble: 1 × 9
## .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Fraction.Genome.Alt… alter… unalt… 76 125 2.00 165. 0.0476 *
p_genome_altered<-ggplot(data = su2c_clinical_tumor_burden,
aes(x = factor( group, levels = c("altered", "unaltered")),
y = Fraction.Genome.Altered,
fill = group,
color = group)) +
theme_light() +
geom_boxplot(alpha = 0.4) +
geom_jitter(shape = 16, position = position_jitter(0.2)) +
scale_fill_manual(values = c("altered" = "orange", "unaltered" = "#5BC0EB")) +
scale_color_manual(values = c("altered" = "orange", "unaltered" = "#5BC0EB")) +
stat_compare_means(label.y =1, label.x = 2, method = "t.test")+
labs(title="Associate CIN signatures genes expression with PCa genome alteration fraction", x="patient groups", y = "Fraction Genome altered")+
theme(plot.title = element_text(size=20, face = "bold"),
axis.text.x = element_text(size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 15, face = "bold"), axis.title.y = element_text(size = 15, face = "bold"),
legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15, face = "bold"))
p_genome_alteredlibrary(readxl)
su2c_dbgap_sample_id<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/MSI_score/phs000915.v2.p2.txt", sep = "\t", header = T)
#https://www.ncbi.nlm.nih.gov/gap/sstr/report/phs000915.v2.p2
msi_sample_id<-readxl::read_excel("/xdisk/mpadi/jiawenyang/data/centrosome_loss/MSI_score/oncotarget-08-7452-s002.xlsx", sheet = "PRAD MSS (SU2C)")
msi_scores_prad<-readxl::read_excel("/xdisk/mpadi/jiawenyang/data/centrosome_loss/MSI_score/oncotarget-08-7452-s003.xlsx", sheet = "PRAD MSS")
su2c_clinical_data<-dplyr::left_join(su2c_clinical_data, msi_scores_prad, by = c("PATIENT_ID" = "Sample"))
su2c_clinical_data<-su2c_clinical_data[su2c_clinical_data$PATIENT_ID %in% msi_sample_id$`Submitted subject id`,]Sig.CNV.seqz_cl_cells_prad_patients_with_clinical<-read.csv("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/copy_number_signatures_enrichment_results_with_clinical_info.csv")
Sig.CNV.seqz_cl_cells_prad_patients_with_clinical<-Sig.CNV.seqz_cl_cells_prad_patients_with_clinical[Sig.CNV.seqz_cl_cells_prad_patients_with_clinical$subject_id %in% su2c_clinical_data$PATIENT_ID,]
su2c_clinical_data_with_cn_sig <- Sig.CNV.seqz_cl_cells_prad_patients_with_clinical[Sig.CNV.seqz_cl_cells_prad_patients_with_clinical$subject_id %in% intersect(msi_scores_prad$Sample, Sig.CNV.seqz_cl_cells_prad_patients_with_clinical$subject_id),]
msi_scores_prad_cn_sig <- msi_scores_prad[msi_scores_prad$Sample %in% intersect(msi_scores_prad$Sample, Sig.CNV.seqz_cl_cells_prad_patients_with_clinical$subject_id), ]
msi_scores_prad_cn_sig$Sample <- as.character(msi_scores_prad_cn_sig$Sample)
msi_scores_prad_cn_sig<-left_join(msi_scores_prad_cn_sig, su2c_clinical_data_with_cn_sig, by = c("Sample" = "subject_id"))
msi_scores_prad_cn_sig$enrich_sig <- paste0("CN-", msi_scores_prad_cn_sig$enrich_sig)
cn_sig1_sig3_test<-t.test(MSISensor ~ enrich_sig, data = msi_scores_prad_cn_sig[msi_scores_prad_cn_sig$enrich_sig %in% c("CN-Sig1" ,"CN-Sig3"),])
p_cn_sig_MSI<-ggplot(data = msi_scores_prad_cn_sig,
aes(x = factor(enrich_sig, levels = c("CN-Sig1", "CN-Sig2", "CN-Sig3")),
y = MSISensor ,
fill = enrich_sig,
color = enrich_sig)) +
theme_light() +
geom_boxplot(alpha = 0.4) +
geom_jitter(shape = 16, position = position_jitter(0.2)) +
scale_fill_manual(labels = c("CN-Sig1", "CN-Sig2", "CN-Sig3"), values=c("#050506", "#DE8B36", "#CD4224")) +
scale_color_manual(labels = c("CN-Sig1", "CN-Sig2", "CN-Sig3"), values=c("#050506", "#DE8B36", "#CD4224")) +
labs(title="Associate CIN signatures genes expression with PCa sample MSI score",
x="CN-Signature groups",
y = "Microsatelight Instability (MSI) score")+
theme(plot.title = element_text(size=20, face = "bold"),
axis.text.x = element_text(size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 15, face = "bold"), axis.title.y = element_text(size = 15, face = "bold"),
legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15, face = "bold"))
p_cn_sig_MSI##
## Welch Two Sample t-test
##
## data: MSISensor by enrich_sig
## t = -2.2979, df = 52.653, p-value = 0.02557
## alternative hypothesis: true difference in means between group CN-Sig1 and group CN-Sig3 is not equal to 0
## 95 percent confidence interval:
## -0.44379690 -0.03008928
## sample estimates:
## mean in group CN-Sig1 mean in group CN-Sig3
## 0.1486667 0.3856098
CN_sig_score<-readRDS(file ="/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups.RDS")
CN_sig_score<-as.data.frame(t(CN_sig_score$Exposure))
CN_sig_score[,"sample"]<-rownames(CN_sig_score)
CN_sig_score[,"sample"]<-gsub("-SRR\\w+", "", CN_sig_score[,"sample"])
msi_scores_prad_cn_score<-left_join(msi_scores_prad_cn_sig, CN_sig_score, by = c("Sample" = "sample"))
msi_scores_prad_cn_score## # A tibble: 57 × 24
## Sample mSINGS MSISensor MANTIS X sample group silhouette_width enrich_sig
## <chr> <dbl> <dbl> <dbl> <int> <chr> <int> <dbl> <chr>
## 1 51150… 0.0381 0.87 0.239 176 SRR30… 1 0.992 CN-Sig3
## 2 51150… 0.0495 0 0.266 311 SRR83… 2 0.992 CN-Sig1
## 3 51150… 0.0254 0.2 0.269 310 SRR83… 2 0.992 CN-Sig1
## 4 51150… 0.0386 0 0.252 247 SRR83… 2 0.992 CN-Sig1
## 5 51150… 0.0269 0 0.265 175 SRR83… 1 0.992 CN-Sig3
## 6 51150… 0.0313 0 0.251 309 SRR83… 2 0.992 CN-Sig1
## 7 51150… 0.0263 0.25 0.257 174 SRR83… 1 0.992 CN-Sig3
## 8 51150… 0.028 0 0.256 173 SRR83… 1 0.992 CN-Sig3
## 9 51150… 0.0308 0 0.250 172 SRR83… 1 0.992 CN-Sig3
## 10 51150… 0.0331 0 0.253 171 SRR83… 1 0.992 CN-Sig3
## # ℹ 47 more rows
## # ℹ 15 more variables: tumor_Run <chr>, Study <chr>, sample_type <chr>,
## # tumor_body_site <chr>, normal_Run <chr>, PatientID <chr>, Age <int>,
## # Stage <chr>, PSA <dbl>, GleasonScore <int>, Fusion <chr>, CNV_ID <chr>,
## # Sig1 <dbl>, Sig2 <dbl>, Sig3 <dbl>
ggplot(msi_scores_prad_cn_score, aes(x=Sig3, y=MANTIS, color = MANTIS)) +
geom_point(color = "#CD4224")+
theme_light() +
geom_smooth(method=lm, color = alpha("#CD4224", 0.4)) +
stat_cor(method = "pearson", label.x = 100, label.y = 0.3 )+
xlab("CN-sig3 exposure score") +
ylab("Microsatelight Instability (MSI) score") +
ggtitle("CN-signature and MSI correlation") +
theme(plot.title = element_text(size=20, face = "bold"),
axis.text.x = element_text(size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 15, face = "bold"), axis.title.y = element_text(size = 15, face = "bold"),
legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15, face = "bold"))ggplot(msi_scores_prad_cn_score, aes(x=Sig2, y=MANTIS, color = MANTIS)) +
geom_point(color = "#DE8B36")+
theme_light() +
geom_smooth(method=lm, color = alpha("#DE8B36", 0.4)) +
stat_cor(method = "pearson", label.x = 10, label.y = 0.3) +
xlab("CN-sig2 exposure score") +
ylab("Microsatelight Instability (MSI) score") +
ggtitle("CN-signature and MSI correlation") +
theme(plot.title = element_text(size=20, face = "bold"),
axis.text.x = element_text(size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 15, face = "bold"), axis.title.y = element_text(size = 15, face = "bold"),
legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15, face = "bold"))ggplot(msi_scores_prad_cn_score, aes(x=Sig1, y=MANTIS, color = MANTIS)) +
geom_point(color = "black")+
theme_light() +
geom_smooth(method=lm, color = alpha("black", 0.4)) +
stat_cor(method = "pearson", label.x = 150, label.y = 0.3) +
xlab("CN-sig1 exposure score") +
ylab("Microsatelight Instability (MSI) score") +
ggtitle("CN-signature and MSI correlation") +
theme(plot.title = element_text(size=20, face = "bold"),
axis.text.x = element_text(size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 15, face = "bold"), axis.title.y = element_text(size = 15, face = "bold"),
legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15, face = "bold"))